import os
import json
import re
import pathlib
import numpy as np
import pandas as pd
import cv2
import hvplot.pandas
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import Counter
from pathlib import Path
# Dataset root directory; the number of entries in its 'ann' sub-directory is
# reported as the number of objects in the dataset.
data_path = Path('HKR_Dataset_Words_Public')
data_length = len(os.listdir(data_path / 'ann'))
print(f'Number of objects in HKR_Dataset_Words_Public: {data_length}')
Number of objects in HKR_Dataset_Words_Public: 64943
def get_annotations(data_path, img_type='.jpg'):
    """Load every annotation file under ``data_path/ann`` into a DataFrame.

    Each annotation file is parsed as UTF-8 JSON. The two values of its
    ``size`` mapping are unpacked, in stored order, as width and height
    (assumes the mapping lists width first -- confirm against the annotation
    schema). ``description`` becomes the ground-truth text and ``name`` plus
    ``img_type`` becomes the image file name.

    Args:
        data_path: ``pathlib.Path`` of the dataset root; must contain ``ann``.
        img_type: Extension appended to each annotation's ``name``.

    Returns:
        DataFrame with columns ``width``, ``height``, ``gt_text`` and
        ``file_path``; one row per annotation file, indexed 0..n-1 in
        directory-iteration order.
    """
    columns = ['width', 'height', 'gt_text', 'file_path']
    rows = []
    for ann_path in data_path.joinpath('ann').iterdir():
        with open(ann_path, encoding='utf8') as json_file:
            ann = json.load(json_file)
        width, height = ann['size'].values()
        rows.append([width, height, ann['description'], ann['name'] + img_type])
    # Build the frame once: growing it with df.loc[len(df)] copies the whole
    # frame for every appended row, which is quadratic in the number of files.
    return pd.DataFrame(rows, columns=columns)
# Load all annotations, then store the image dimensions as floats.
ann_df = get_annotations(data_path)
# astype converts both columns in one vectorised step instead of calling
# float() on every element from Python.
ann_df[['height', 'width']] = ann_df[['height', 'width']].astype(float)
ann_df.sample(10)
| | width | height | gt_text | file_path |
|---|---|---|---|---|
| 14273 | 528.0 | 82.0 | Атырауская | 10_30_115.jpg |
| 61399 | 595.0 | 75.0 | Снуют пунцовые | 9_0_583.jpg |
| 4321 | 366.0 | 123.0 | Казахстан | 10_0_58.jpg |
| 22566 | 226.0 | 89.0 | Атырау | 10_9_364.jpg |
| 34276 | 354.0 | 88.0 | джигит | 1_31_648.jpg |
| 26436 | 365.0 | 89.0 | Каражал | 12_17_134.jpg |
| 35796 | 619.0 | 99.0 | Конь догонит, | 1_4_106.jpg |
| 63705 | 841.0 | 100.0 | месяц мимоезжий, | 9_40_556.jpg |
| 63329 | 819.0 | 58.0 | ложится на полати | 9_36_547.jpg |
| 4976 | 206.0 | 49.0 | Костанай | 10_11_238.jpg |
# Derived features: label length in characters, and image width divided by
# that length, rounded to the nearest whole number.
ann_df['text_length'] = ann_df['gt_text'].map(len)
ann_df['pxls_per_chars'] = (ann_df['width'] / ann_df['text_length']).round()
ann_df.describe().round(2)
| | width | height | text_length | pxls_per_chars |
|---|---|---|---|---|
| count | 64943.00 | 64943.00 | 64943.00 | 64943.00 |
| mean | 443.28 | 75.70 | 11.02 | 41.02 |
| std | 176.95 | 17.39 | 4.39 | 9.58 |
| min | 44.00 | 14.00 | 2.00 | 11.00 |
| 25% | 305.00 | 64.00 | 7.00 | 35.00 |
| 50% | 430.00 | 75.00 | 11.00 | 40.00 |
| 75% | 565.00 | 87.00 | 14.00 | 45.00 |
| max | 1697.00 | 150.00 | 42.00 | 475.00 |
# Summary (count / unique / top / freq) of the object-dtype columns only.
ann_df.describe(include=object)
| | gt_text | file_path |
|---|---|---|
| count | 64943 | 64943 |
| unique | 2808 | 64943 |
| top | Актау | 0_0_0.jpg |
| freq | 484 | 1 |
# Axis limits, ticks and canvas size for the width-vs-height scatter plot.
plot_opts = {
    'xlim': (0, 1200),
    'ylim': (0, 250),
    'grid': True,
    'xticks': [0, 200, 400, 600, 800, 1000, 1200],
    'yticks': [50, 100, 150, 200, 250],
    'height': 400,
    'width': 550,
}
# Marker styling applied on top of the scatter plot.
style_opts = {
    'scaling_factor': 0.2,
    'line_alpha': 1,
    'fill_alpha': 0.1,
}
ann_df[['width', 'height']].hvplot.scatter(x='width', y='height', **plot_opts).options(**style_opts)
# Kernel density estimate of the label lengths.
ann_df.hvplot.kde(y='text_length')
# Bar chart of the first 50 entries of the normalised label counts
# (value_counts orders them from most to least frequent).
ann_df['gt_text'].value_counts(normalize=True).iloc[:50].hvplot.bar(
    title='Label Frequencies',
    ylabel='fraction of all objects',
    width=700,
    height=450,
    rot=60,
    line_alpha=0,
)
# Absolute count per label and the mean of those counts.
label_counts = ann_df['gt_text'].value_counts()
print('Label frequencies:')
print(label_counts)
print(f'Average label frequency: {round(label_counts.mean())}')
Label frequencies:
Актау 484
Карагандинская 476
Казахстан 474
Шымкент 473
Кокшетау 469
...
свобода любить? 1
свободи любить? 1
внебесах я парить 1
в небесах я поршть 1
Вид подстоялого 1
Name: gt_text, Length: 2808, dtype: int64
Average label frequency: 23
# Count how often each character occurs across all labels.
all_chars_freq = Counter()
for descr in ann_df['gt_text']:
    # update() adds the counts in place; `+= Counter(descr)` built a
    # throwaway Counter for every label.
    all_chars_freq.update(descr)
# Iterating a Counter yields its keys, so no intermediate list is needed.
all_chars = sorted(all_chars_freq)
chars_df = pd.DataFrame(all_chars_freq.values(),
                        columns=['frequency'],
                        index=all_chars_freq.keys())
chars_df = chars_df.sort_values(['frequency'], ascending=False)
chars_df.hvplot.bar(width=920, height=450,
                    rot=0, line_alpha=0,
                    title='Character Frequencies',
                    ylabel='frequency')
print(f'All characters:\n {all_chars}')
All characters:
[' ', '!', '(', ')', ',', '-', '.', ':', ';', '?', 'H', 'o', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ч', 'Ш', 'Щ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё', 'ғ', 'Қ', 'қ', 'Ү', 'Ө', 'ө', '–', '—', '…']
def has_cyrillic(str):
    """Return True if the text contains a letter in а-я, А-Я, ё or Ё.

    Only these ranges are matched, so other Cyrillic-script letters (for
    example 'ғ' or 'қ') do not count. The parameter name shadows the builtin
    ``str``; it is kept so existing callers are unaffected.
    """
    match = re.search('[а-яА-ЯёЁ]', str)
    return match is not None
# Kazakh-specific letters: letters of the Unicode Cyrillic block
# (U+0400..U+04FF) that has_cyrillic() does not match. The block check keeps
# stray Latin letters such as 'H' and 'o' from being counted as Kazakh.
kazakh_chars = [
    c for c in all_chars
    if c.isalpha() and '\u0400' <= c <= '\u04ff' and not has_cyrillic(c)
]
# A label counts as Kazakh when it contains at least one of those letters.
num_kazakh_labels = sum(
    any(char in label for char in kazakh_chars)
    for label in ann_df['gt_text']
)
print(f'Kazakh characters: {kazakh_chars}')
print(f'Number of Kazakh labels: {num_kazakh_labels}')
print(f'Percentage of Kazakh labels: {round(num_kazakh_labels/data_length*100, 2)}%')
Kazakh characters: ['H', 'o', 'ғ', 'Қ', 'қ', 'Ү', 'Ө', 'ө']
Number of Kazakh labels: 240
Percentage of Kazakh labels: 0.37%
def print_imgs_with_hist(df, size=5, width=15, height=20):
    """Plot randomly sampled images beside their per-channel colour histograms.

    Each sampled row fills one row of the figure: the image, titled with its
    label, on the left and the histograms of its three channels on the right.

    Args:
        df: DataFrame with ``file_path`` and ``gt_text`` columns. File names
            are resolved against the ``img`` directory under the module-level
            ``data_path``.
        size: Number of rows to sample, which is also the number of figure rows.
        width: Figure width passed to matplotlib's ``figsize``.
        height: Figure height passed to matplotlib's ``figsize``.

    Raises:
        FileNotFoundError: If OpenCV cannot read one of the sampled images.
    """
    num_cols = 2
    # squeeze=False keeps the axes grid two-dimensional even when size == 1.
    _, axes = plt.subplots(size, num_cols, figsize=(width, height), squeeze=False)
    # Images are converted to RGB below, so channel 0 is red and channel 2 is
    # blue; each histogram line is drawn in the colour of its own channel.
    channel_colors = ('r', 'g', 'b')
    for (img_ax, hist_ax), (_, row) in zip(axes, df.sample(size).iterrows()):
        img_path = '/'.join([data_path.as_posix(), 'img', row['file_path']])
        img = cv2.imread(img_path)
        if img is None:
            # imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        img_ax.imshow(img)
        img_ax.set_title(f'label: {row.gt_text}')

        for channel, line_color in enumerate(channel_colors):
            histr = cv2.calcHist([img], [channel], None, [256], [0, 256])
            hist_ax.plot(histr, color=line_color)
        hist_ax.set_title('color histogram')
        hist_ax.set_xlim([0, 255])
        hist_ax.set_ylim([0, 2000])
    plt.show()
# Channel order of the displayed images (they are converted from BGR to RGB
# before being shown).
print(f'Color format: RGB')
# Show a random sample of images together with their colour histograms.
print_imgs_with_hist(ann_df[['file_path', 'gt_text']])
Color format: RGB
# For every numeric column, record its five smallest and five largest values
# together with the image files they belong to.
extreme_df = pd.DataFrame()
for col in ann_df.select_dtypes(include=[np.number]):
    for prefix, ascending in (('min', True), ('max', False)):
        df = ann_df[[col, 'file_path']].sort_values([col], ascending=ascending).head()
        # .array drops the original index so the values align by position.
        extreme_df[f'{prefix} {col} file'] = df['file_path'].array
        extreme_df[f'{prefix} {col}'] = df[col].array
extreme_df
| | min width file | min width | max width file | max width | min height file | min height | max height file | max height | min text_length file | min text_length | max text_length file | max text_length | min pxls_per_chars file | min pxls_per_chars | max pxls_per_chars file | max pxls_per_chars |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11_19_49.jpg | 44.0 | 7_42_419.jpg | 1697.0 | 10_20_136.jpg | 14.0 | 0_27_44.jpg | 150.0 | 1_28_646.jpg | 2 | 10_38_284.jpg | 42 | 11_27_63.jpg | 11.0 | 1_44_76.jpg | 475.0 |
| 1 | 11_19_41.jpg | 45.0 | 10_36_313.jpg | 1489.0 | 1_39_222.jpg | 19.0 | 4_26_279.jpg | 149.0 | 1_28_645.jpg | 2 | 10_35_307.jpg | 42 | 9_35_575.jpg | 18.0 | 1_44_113.jpg | 475.0 |
| 2 | 11_16_40_.jpg | 51.0 | 10_38_313_.jpg | 1477.0 | 1_37_222.jpg | 20.0 | 10_16_83_.jpg | 147.0 | 1_28_647.jpg | 2 | 8_36_842.jpg | 23 | 11_14_9.jpg | 18.0 | 1_28_76.jpg | 470.0 |
| 3 | 11_16_41.jpg | 53.0 | 10_35_313.jpg | 1406.0 | 1_14_222.jpg | 21.0 | 10_38_460.jpg | 146.0 | 1_28_648.jpg | 2 | 7_42_428.jpg | 23 | 9_37_575.jpg | 18.0 | 1_28_78.jpg | 446.0 |
| 4 | 1_44_108.jpg | 54.0 | 10_38_129_.jpg | 1300.0 | 9_45_869.jpg | 23.0 | 10_30_366.jpg | 146.0 | 1_28_649.jpg | 2 | 8_36_487.jpg | 23 | 9_47_575.jpg | 18.0 | 1_33_97.jpg | 374.0 |
def print_extreme_imgs(df, img_col, value_col, size=5):
    """Show the images named in ``img_col``, each titled with its ``value_col`` value.

    Args:
        df: DataFrame holding image file names and the associated values.
        img_col: Column with image file names, resolved against the ``img``
            directory under the module-level ``data_path``.
        value_col: Column whose value is shown in each figure title.
        size: Number of leading rows of ``df`` to display.

    Raises:
        FileNotFoundError: If OpenCV cannot read one of the images.
    """
    for _, row in df.head(size).iterrows():
        img_path = '/'.join([data_path.as_posix(), 'img', row[img_col]])
        img = cv2.imread(img_path)
        if img is None:
            # imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Figure size is 1/20 of the pixel dimensions, capped at 10 per side.
        fig_width = min(img.shape[1] / 20, 10)
        fig_height = min(img.shape[0] / 20, 10)
        plt.figure(figsize=(fig_width, fig_height))
        plt.imshow(img)
        plt.title(value_col + ': ' + str(row[value_col]))
        plt.show()
# Columns of extreme_df come in (file name, value) pairs; display the images
# of each pair in turn.
for img_col, value_col in zip(extreme_df.columns[::2], extreme_df.columns[1::2]):
    print_extreme_imgs(extreme_df, img_col, value_col)